project.dir = paste0(params$working_dir, params$project_name)
if(!dir.exists(project.dir)){dir.create(project.dir)}
data.dir = paste0(project.dir, "/data")
if(!dir.exists(data.dir)){dir.create(data.dir)}
results.dir = paste0(project.dir, "/results")
if(!dir.exists(results.dir)){dir.create(results.dir)}
if(params$goi != "none"){
gene.dir = paste0(results.dir, "/", params$goi, "_expressing_cells")
} else {
gene.dir = paste0(results.dir, "/all_cells")}
if(!dir.exists(gene.dir)){dir.create(gene.dir)}
knitr::opts_knit$set(root.dir = params$working_dir)
set.seed(1)
source(paste0(project.dir, "/src/scRNA_seq_functions.R"))
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(Seurat)
## Attaching SeuratObject
## Attaching sp
library(patchwork)
library(clustree)
## Loading required package: ggraph
##
## Attaching package: 'ggraph'
##
## The following object is masked from 'package:sp':
##
## geometry
library(filesstrings)
##
## Attaching package: 'filesstrings'
##
## The following object is masked from 'package:dplyr':
##
## all_equal
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
load(paste0(data.dir, "/", params$seuratOb))
seuratOb
## An object of class Seurat
## 22405 features across 4570 samples within 1 assay
## Active assay: RNA (22405 features, 0 variable features)
seuratOb = SCTransform(seuratOb,
method = "glmGamPoi",
vars.to.regress = "percent.mt",
verbose = FALSE)
A list of human cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat. These have been converted to the mouse version of the genes. We'll use the expression of these genes to estimate the cell-cycle phase of each cell.
# This is how I got the cc genes, but it takes too long to run everytime
# https://satijalab.org/seurat/archive/v3.1/cell_cycle_vignette.html
# A list of human cell cycle markers, from Tirosh et al, 2015, is loaded with Seurat.
# We can segregate this list into markers of G2/M phase and markers of S phase
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
# Basic function to convert human to mouse gene names
convertHumanGeneList <- function(x){
require("biomaRt")
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
genesV2 = getLDS(attributes = c("hgnc_symbol"),
filters = "hgnc_symbol",
values = x ,
mart = human,
attributesL = c("mgi_symbol"),
martL = mouse,
uniqueRows=T)
return(genesV2)
}
s.mouse = convertHumanGeneList(s.genes)
s.mouse = s.mouse$MGI.symbol
g2m.mouse = convertHumanGeneList(g2m.genes)
g2m.mouse = g2m.mouse$MGI.symbol
s.mouse = c("Exo1", "Msh2","Mcm4","Gmnn","Rrm2",
"Chaf1b","Mcm2","Slbp","Hells", "Cdc6",
"Rpa2","Gins2", "Ubr7","Uhrf1", "Fen1",
"Cdc45", "Rad51ap1", "Cdca7", "Dtl", "Nasp",
"Clspn", "Ung", "Usp1","Dscc1", "Prim1",
"Wdr76", "Tipin", "Mcm6","Pola1", "Mcm5",
"Blm", "Casp8ap2", "Tyms", "E2f8","Rfc2",
"Rad51", "Pcna","Ccne2", "Rrm1","Brip1")
g2m.mouse = c("Dlgap5","Ctcf", "Smc4", "Kif20b","Gtse1",
"Cdc25c","Cdca3","Tacc3","Ttk","Tpx2",
"Top2a","Cks1brt", "G2e3", "Ckap2","Hmgb2",
"Anp32e","Lbr","Ncapd2","Cks2", "Cdca2",
"Psrc1","Cenpe","Ckap5","Nuf2", "Ect2",
"Anln", "Kif2c","Cenpf","Cks1b","Rangap1",
"Birc5","Cdca8","Kif11","Hmmr", "Aurkb",
"Cdc20","Ndc80","Kif23","Nek2", "Hjurp",
"Bub1", "Nusap1","Aurka","Ccnb2","Tubb4b",
"Ube2c","Ckap2l","Cenpa","Mki67" )
seuratOb = CellCycleScoring(seuratOb,
s.features = s.mouse,
g2m.features = g2m.mouse,
set.ident = FALSE)
seuratOb <- SCTransform(seuratOb,
method = "glmGamPoi",
vars.to.regress = c("percent.mt", "S.Score", "G2M.Score"),
verbose = FALSE)
Perform dimensionality reduction by PCA and UMAP
seuratOb <- RunPCA(seuratOb, verbose = FALSE)
seuratOb <- RunUMAP(seuratOb,
dims = 1:30,
verbose = FALSE)
Cluster the cells with multiple resolution parameters and visualize the divisions with clustree
seuratOb <- FindNeighbors(seuratOb,
dims = 1:30,
verbose = FALSE)
# test resolutions from 0 to 1
res_vec = seq(0, 1.2, .1)
seuratOb <- FindClusters(seuratOb,
verbose = FALSE,
resolution = res_vec)
# save plot
if(params$goi == "none"){
pdf(file = paste0(results.dir, "/clustree_all_cells.pdf"),
width = 6,
height = 10)
print(clustree(seuratOb, prefix = "SCT_snn_res."))
dev.off()
} else{
pdf(file = paste0(results.dir, "/clustree_", params$goi,"_expressing.pdf"),
width = 6,
height = 10)
print(clustree(seuratOb, prefix = "SCT_snn_res."))
dev.off()
}
## png
## 2
# print clustree
clustree(seuratOb, prefix = "SCT_snn_res.")
Each row of circles on the clustree plot represents cells clustered at the resolution denoted by SCT_snn_res. Each circle represents a cluster. The arrows show how the cells move from cluster to cluster at different resolutions. in_prop show the proportion of the cells if the receiving cluster that came from the donating cluster. Make plots for all of them.
if(params$goi != "none"){
save(seuratOb,
file = paste0(data.dir,
"/",
params$project_name,
"_",
params$goi,
"_expressing_QC_filtered_SeuratObject.Rdata"))
save_path = paste0(data.dir,
"/",
params$project_name,
"_",
params$goi,
"_expressing_QC_filtered_SeuratObject.Rdata")
} else {
save(seuratOb,
file = paste0(data.dir,
"/",
params$project_name,
"_QC_filtered_SeuratObject.Rdata"))
save_path = paste0(data.dir,
"/",
params$project_name,
"_QC_filtered_SeuratObject.Rdata")
}
Saved here: /mnt/ufs18/home-115/hickeys6/Documents/Ralston/projects/Day17_Reprogramming_scRNA_seq/data/Day17_Reprogramming_scRNA_seq_QC_filtered_SeuratObject.Rdata
I want to check if the clusters represent known causes of variation between otherwise similar cells including:
# resolutions of interest
res_oi = res_vec[2:length(res_vec)]
# loop for making the plots
for(res in res_oi){
# set the identity
Idents(object = seuratOb) <- paste0("SCT_snn_res.", res)
# cluster plot
P1 = DimPlot(seuratOb, label = TRUE) + NoLegend()
# UMI counts
P2 =
VlnPlot(seuratOb,
features = "nCount_RNA",
pt.size = 0) +
NoLegend() +
ggtitle("UMI Counts") +
xlab("Cluster") +
geom_boxplot(width=0.1, color="black")
# n Genes
P3 =
VlnPlot(seuratOb,
features = "nFeature_RNA",
pt.size = 0) +
NoLegend() +
ggtitle("nGenes") +
xlab("Cluster") +
geom_boxplot(width=0.1, color="black")
# percent mito
P4 =
VlnPlot(seuratOb,
features = "percent.mt",
pt.size = 0) +
NoLegend() +
ggtitle("Percent mitochondrial reads") +
xlab("Cluster") +
geom_boxplot(width=0.1, color="black")
# select the resolution col of interest and the Phase col
select_cols = c(paste0("SCT_snn_res.", res), "Phase")
meta_oi = seuratOb@meta.data[,select_cols]
# tally the number of cells in each phase per cluster
cc_tally =
meta_oi %>%
group_by(across()) %>%
tally()
colnames(cc_tally) = c("Cluster", "Phase", "nCells")
# all the tally of phases across all cells
all_tally =
meta_oi %>%
group_by(Phase) %>%
tally() %>%
mutate(Cluster = "All.Cells") %>%
select(Cluster,
Phase,
n)
colnames(all_tally) = c("Cluster", "Phase", "nCells")
# r bind all cells and cluster-wise tally
cc_tally = rbind(cc_tally, all_tally)
# stacked bar plot with proportion of cells in each cell-cycle phase
P5 = ggplot(cc_tally,
aes(fill = Phase,
y = nCells,
x = Cluster)) +
geom_bar(position="fill",
stat="identity") +
theme_classic() +
ggtitle("Proportion cells in each phase") +
ylab("proportion of cells")
# combine the plots with patchwork
Pall =
((P1/P5) | (P2 + P3 + P4)) +
plot_annotation(title = paste0("resolution = ", res))
# Print the plot
print(Pall)
# save the plot
if(params$goi == "none"){
ggsave(Pall,
file = paste0(gene.dir,
"/all_cells_cluster_QC_plots_res",
res,
".png"),
width = 9,
height = 6,
dpi = 300)
} else {
ggsave(Pall,
file = paste0(gene.dir,
"/",
params$goi,
"_expressing_cells_cluster_QC_plots_res",
res, ".png"),
width = 9,
height = 6,
dpi = 300)
}
}
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
##
## Matrix products: default
## BLAS/LAPACK: /opt/software/OpenBLAS/0.3.7-GCC-8.3.0/lib/libopenblas_sandybridgep-r0.3.7.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Matrix_1.4-1 filesstrings_3.2.3 clustree_0.5.0 ggraph_2.0.6
## [5] patchwork_1.1.2 sp_1.5-0 SeuratObject_4.1.1 Seurat_4.1.1
## [9] forcats_0.5.2 stringr_1.4.0 dplyr_1.0.10 purrr_0.3.4
## [13] readr_2.1.2 tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6
## [17] tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 reticulate_1.26
## [3] tidyselect_1.1.2 htmlwidgets_1.5.4
## [5] grid_4.1.0 Rtsne_0.15
## [7] munsell_0.5.0 codetools_0.2-18
## [9] ragg_1.2.2 ica_1.0-3
## [11] future_1.28.0 miniUI_0.1.1.1
## [13] withr_2.5.0 spatstat.random_2.2-0
## [15] colorspace_2.0-3 progressr_0.11.0
## [17] Biobase_2.52.0 highr_0.9
## [19] knitr_1.40 stats4_4.1.0
## [21] ROCR_1.0-11 tensor_1.5
## [23] listenv_0.8.0 MatrixGenerics_1.4.0
## [25] labeling_0.4.2 GenomeInfoDbData_1.2.6
## [27] polyclip_1.10-0 farver_2.1.1
## [29] parallelly_1.32.1 vctrs_0.4.1
## [31] generics_0.1.3 xfun_0.32
## [33] R6_2.5.1 GenomeInfoDb_1.28.0
## [35] graphlayouts_0.8.1 bitops_1.0-7
## [37] spatstat.utils_2.3-1 cachem_1.0.6
## [39] DelayedArray_0.18.0 assertthat_0.2.1
## [41] promises_1.2.0.1 scales_1.2.1
## [43] googlesheets4_1.0.1 rgeos_0.5-9
## [45] gtable_0.3.0 globals_0.16.1
## [47] goftest_1.2-3 tidygraph_1.2.2
## [49] rlang_1.0.5 systemfonts_1.0.4
## [51] splines_4.1.0 lazyeval_0.2.2
## [53] gargle_1.2.0 spatstat.geom_2.4-0
## [55] broom_1.0.1 checkmate_2.1.0
## [57] yaml_2.3.5 reshape2_1.4.4
## [59] abind_1.4-5 modelr_0.1.9
## [61] backports_1.4.1 httpuv_1.6.5
## [63] tools_4.1.0 ellipsis_0.3.2
## [65] spatstat.core_2.4-4 jquerylib_0.1.4
## [67] RColorBrewer_1.1-3 BiocGenerics_0.38.0
## [69] ggridges_0.5.3 Rcpp_1.0.9
## [71] plyr_1.8.7 sparseMatrixStats_1.4.0
## [73] zlibbioc_1.38.0 RCurl_1.98-1.8
## [75] rpart_4.1-15 deldir_1.0-6
## [77] pbapply_1.5-0 viridis_0.6.2
## [79] cowplot_1.1.1 S4Vectors_0.30.0
## [81] zoo_1.8-9 SummarizedExperiment_1.22.0
## [83] haven_2.5.1 ggrepel_0.9.1
## [85] cluster_2.1.2 fs_1.5.2
## [87] magrittr_2.0.3 data.table_1.14.2
## [89] glmGamPoi_1.6.0 scattermore_0.8
## [91] lmtest_0.9-40 reprex_2.0.2
## [93] RANN_2.6.1 googledrive_2.0.0
## [95] fitdistrplus_1.1-8 matrixStats_0.62.0
## [97] hms_1.1.2 mime_0.12
## [99] evaluate_0.16 xtable_1.8-4
## [101] readxl_1.4.1 IRanges_2.26.0
## [103] gridExtra_2.3 compiler_4.1.0
## [105] KernSmooth_2.23-20 crayon_1.5.1
## [107] htmltools_0.5.3 mgcv_1.8-40
## [109] later_1.3.0 tzdb_0.3.0
## [111] lubridate_1.8.0 DBI_1.1.3
## [113] tweenr_2.0.2 dbplyr_2.2.1
## [115] MASS_7.3-58.1 cli_3.3.0
## [117] parallel_4.1.0 igraph_1.3.4
## [119] GenomicRanges_1.44.0 pkgconfig_2.0.3
## [121] plotly_4.10.0 spatstat.sparse_2.1-1
## [123] xml2_1.3.3 bslib_0.4.0
## [125] XVector_0.32.0 rvest_1.0.3
## [127] digest_0.6.29 sctransform_0.3.4
## [129] RcppAnnoy_0.0.19 spatstat.data_2.2-0
## [131] rmarkdown_2.16 cellranger_1.1.0
## [133] leiden_0.4.2 uwot_0.1.14
## [135] DelayedMatrixStats_1.14.0 shiny_1.7.2
## [137] lifecycle_1.0.1 nlme_3.1-159
## [139] jsonlite_1.8.0 viridisLite_0.4.0
## [141] fansi_1.0.3 pillar_1.8.1
## [143] lattice_0.20-45 fastmap_1.1.0
## [145] httr_1.4.4 survival_3.4-0
## [147] glue_1.6.2 png_0.1-7
## [149] ggforce_0.3.4 stringi_1.7.8
## [151] sass_0.4.2 textshaping_0.3.6
## [153] strex_1.4.3 irlba_2.3.5
## [155] future.apply_1.9.0